library(tidyverse) #version 1.2.1
library(caret) #version 6.0-82
library(sjmisc) #version 2.7.9
library(jtools) #version 2.0.0
library(lubridate) #version 1.7.4
library(glmnet) #version 2.0-16
library(extrafont) #version 0.17
library(modelr) #version 0.1.2
library(foreach) #version 1.4.4 
loadfonts()#to load latex fonts

#set working directory to the "Data" folder location

#### 1.1 Read and Prepare Data ####
#Note that GEZO is the Dutch abbreviation for the Dutch Health Survey
GEZO16 <- read_rds("GEZO16.rds") #find this object under folder "Data"
GEZO17 <- read_rds("GEZO17.rds") #find this object under folder "Data"


#change variable labels so that all categorical variables have clear category labels
#also force some variables to be factors
#for the 2016 GEZO data. 
GEZO16 <- GEZO16 %>%
  mutate(id = as.factor(id),
         event = if_else(event == 0, "No", "Yes"),
         event = factor(event, levels = c("No", "Yes")),
         Phase = if_else(Phase == "Remind1", "Reminder1", if_else(Phase == "Remind2", "Reminder2", "Invitation")),
         Phase = factor(Phase, levels = c("Invitation", "Reminder1", "Reminder2")),
         veldwerkperiode = as.character(veldwerkperiode),
         veldwerkperiode = recode(veldwerkperiode,
                                  "201601" = "January",
                                  "201602" = "February",
                                  "201603" = "March",
                                  "201604" = "April",
                                  "201605" = "May",
                                  "201606" = "June",
                                  "201607" = "July",
                                  "201608" = "August",
                                  "201609" = "September",
                                  "201610" = "October",
                                  "201611" = "November",
                                  "201612" = "December"),
         veldwerkperiode = factor(veldwerkperiode, levels = c("January",
                                                              "February",
                                                              "March",
                                                              "April",
                                                              "May",
                                                              "June",
                                                              "July",
                                                              "August",
                                                              "September",
                                                              "October",
                                                              "November",
                                                              "December"))) %>%
  
  rename(ID = id,
         Days = exit,
         Response = event,
         Date = Date,
         SurveyPhase = Phase,
         FieldworkPeriod = veldwerkperiode)

#for the 2017 GEZO data
GEZO17 <- GEZO17 %>%
  mutate(id = as.factor(id),
         event = if_else(event == 0, "No", "Yes"),
         event = factor(event, levels = c("No", "Yes")),
         Phase = if_else(Phase == "Remind1", "Reminder1", if_else(Phase == "Remind2", "Reminder2", "Invitation")),
         Phase = factor(Phase, levels = c("Invitation", "Reminder1", "Reminder2")),
         veldwerkperiode = as.character(veldwerkperiode),
         veldwerkperiode = recode(veldwerkperiode,
                                  "201701" = "January",
                                  "201702" = "February",
                                  "201703" = "March",
                                  "201704" = "April",
                                  "201705" = "May",
                                  "201706" = "June",
                                  "201707" = "July",
                                  "201708" = "August",
                                  "201709" = "September",
                                  "201710" = "October",
                                  "201711" = "November",
                                  "201712" = "December"),
         veldwerkperiode = factor(veldwerkperiode, levels = c("January",
                                                              "February",
                                                              "March",
                                                              "April",
                                                              "May",
                                                              "June",
                                                              "July",
                                                              "August",
                                                              "September",
                                                              "October",
                                                              "November",
                                                              "December"))) %>%
  rename(ID = id,
         Days = exit,
         Response = event,
         Date = Date,
         SurveyPhase = Phase,
         FieldworkPeriod = veldwerkperiode)



#Prepare the data with additional variables:
#Season
GEZO16 <- GEZO16 %>%
  mutate(Season = if_else(FieldworkPeriod %in% c("December", "January", "February"), "Winter",
                          if_else(FieldworkPeriod %in% c("March", "April", "May"), "Spring",
                                  if_else(FieldworkPeriod %in% c("June", "July", "August"), "Summer", "Autumn")))) %>%
  mutate(Season = factor(Season, levels = c("Spring", "Summer", "Autumn", "Winter")))

GEZO17 <- GEZO17 %>%
  mutate(Season = if_else(FieldworkPeriod %in% c("December", "January", "February"), "Winter",
                          if_else(FieldworkPeriod %in% c("March", "April", "May"), "Spring",
                                  if_else(FieldworkPeriod %in% c("June", "July", "August"), "Summer", "Autumn")))) %>%
  mutate(Season = factor(Season, levels = c("Spring", "Summer", "Autumn", "Winter")))

#Day of the Week
GEZO16 <- GEZO16 %>%
  mutate(DayOfWeek = as.character(wday(Date, week_start = 1))) %>%
  mutate(DayOfWeek = recode(DayOfWeek, 
                            "1" = "Monday",
                            "2" = "Tuesday",
                            "3" = "Wednesday",
                            "4" = "Thursday", 
                            "5" = "Friday",
                            "6" = "Saturday", 
                            "7" = "Sunday")) %>%
  mutate(DayOfWeek = factor(DayOfWeek, levels = c("Monday", "Tuesday", "Wednesday", "Thursday",
                                                  "Friday", "Saturday", "Sunday")))

GEZO17 <- GEZO17 %>%
  mutate(DayOfWeek = as.character(wday(Date, week_start = 1))) %>%
  mutate(DayOfWeek = recode(DayOfWeek, 
                            "1" = "Monday",
                            "2" = "Tuesday",
                            "3" = "Wednesday",
                            "4" = "Thursday", 
                            "5" = "Friday",
                            "6" = "Saturday", 
                            "7" = "Sunday")) %>%
  mutate(DayOfWeek = factor(DayOfWeek, levels = c("Monday", "Tuesday", "Wednesday", "Thursday",
                                                  "Friday", "Saturday", "Sunday")))


#Holidays
#find "Dutch Holidays.txt" under folder "Data"
Holidays <- read_delim("Dutch Holidays.txt", delim = "\t") %>%
  pull(Date)

GEZO16 <- GEZO16 %>%
  mutate(Holiday = if_else(Date %in% Holidays, 1, 0))

GEZO17 <- GEZO17 %>%
  mutate(Holiday = if_else(Date %in% Holidays, 1, 0))

#Weather (find the data set under "Data" folder)
Weather <- read_csv("KNMI_Weather.csv") %>%
  mutate(AveWind = AveWind/10,
         MaxWind = MaxWind/10,
         MinWind = MinWind/10,
         MaxTemp = MaxTemp/10,
         AveTemp = AveTemp/10,
         MinTemp = MinTemp/10,
         SunDur = SunDur/10,
         RainVol = RainVol/10,
         RainDur = RainDur/10,
         MaxRain = MaxRain/10,
         AveAir = AveAir/10)

GEZO16 <- GEZO16 %>%
  left_join(Weather, "Date")

GEZO17 <- GEZO17 %>%
  left_join(Weather, "Date")

#GT data (find the data set under "Data" folder)
GT1617 <- read_rds("GT1617_Full.rds")

GEZO16 <- GEZO16 %>%
  left_join(GT1617, "Date")
GEZO17 <- GEZO17 %>%
  left_join(GT1617, "Date")


#### 1.2 Short Data Format ####
#For the analysis where we model the effects of time-varying predictors 
#(without time-fixed ones), we can convert the original long data formats (i.e. person-period) 
#into short data formats, where response information is compressed into daily intervals, 
#without losing any information. The short formats also accelerate analysis.

#for the 2016 GEZO data
GEZO16_Short <- GEZO16 %>%
  mutate(ResponseInd = if_else(Response == "Yes", 1, 0)) %>%
  group_by(Date, Days, SurveyPhase, FieldworkPeriod, DayOfWeek, Holiday, Season) %>%
  summarise(Hazard = mean(ResponseInd),
            Response = sum(ResponseInd),
            Total = n()) %>%
  ungroup() %>%
  select(Date, Hazard, Response, Total, everything())

GEZO16_Short <- GEZO16_Short %>%
  left_join(Weather, "Date") %>%
  left_join(GT1617, "Date")

#for the 2017 GEZO data
GEZO17_Short <- GEZO17 %>%
  mutate(ResponseInd = if_else(Response == "Yes", 1, 0)) %>%
  group_by(Date, Days, SurveyPhase, FieldworkPeriod, DayOfWeek, Holiday, Season) %>%
  summarise(Hazard = mean(ResponseInd),
            Response = sum(ResponseInd),
            Total = n()) %>%
  ungroup() %>%
  select(Date, Hazard, Response, Total, everything())

GEZO17_Short <- GEZO17_Short %>%
  left_join(Weather, "Date") %>%
  left_join(GT1617, "Date")



#### 1.3 Split Data into Train and Validate Sets ####
#For long format data
GEZO1617 <- GEZO16 %>%
  rbind(GEZO17) %>%
  mutate(Year = year(Date)) %>%
  mutate(id = group_indices(., ID, Year)) %>%
  mutate(ID = id) %>%
  select(-id, -Year)

Valid_ID_Long <- GEZO1617 %>%
  mutate(ID = row_number()) %>%
  filter(year(Date) == 2017 & FieldworkPeriod %in% c("July","August", "September",
                                                     "October", "November", "December")) %>%
  pull(ID)

GEZO_Train_Long <- GEZO1617[-Valid_ID_Long,] #training set, long format
GEZO_Valid_Long <- GEZO1617[Valid_ID_Long,] #validation set, long format

#For short format data
GEZO1617_Short <- GEZO16_Short %>%
  rbind(GEZO17_Short)

Valid_ID_Short <- GEZO1617_Short %>%
  mutate(ID = row_number()) %>%
  filter(year(Date) == 2017 & FieldworkPeriod %in% c("July","August", "September",
                                                     "October", "November", "December")) %>%
  pull(ID)

GEZO_Train_Short <- GEZO1617_Short[-Valid_ID_Short,] #training set, short format
GEZO_Valid_Short <- GEZO1617_Short[Valid_ID_Short,] #validation set, long format




#### 2. Adaptive Lasso Model ####

#### 2.1 Algorithm Adaptive Lasso ----
Adaptive_Lasso <- function(X_matrix, Y_matrix, seed) {
  set.seed(seed)
  
  ridge_cv <- cv.glmnet(x = X_matrix, 
                        y = Y_matrix, 
                        alpha = 0,
                        family = "binomial", 
                        type.measure = "deviance",
                        nfolds = 10)
  
  best_ridge_coef <- as.numeric(coef(ridge_cv, s = ridge_cv$lambda.min))[-1]
  
  set.seed(seed)
  lasso_cv <- cv.glmnet(x = X_matrix, 
                        y = Y_matrix, 
                        alpha = 1,
                        family = "binomial", 
                        type.measure = "deviance",
                        nfolds = 10,
                        penalty.factor = 1 / abs(best_ridge_coef),
                        ## prevalidated array is returned
                        keep = TRUE)
  
  return(lasso_cv)
}

#### 2.2 Baseline Model ----
#Prepare for the input matrices
FullRankDummy <- dummyVars(formula = Hazard ~ . -Season -FieldworkPeriod, 
                           GEZO_Train_Short, 
                           fullRank = TRUE,
                           levelsOnly = TRUE)

GEZO_Train_Short_Dummy <- predict(FullRankDummy, GEZO_Train_Short) %>%
  as_tibble()

#predictor matrix
X_Lasso <- as.matrix(GEZO_Train_Short_Dummy[,c(-1,-2,-3)])
#outcome matrix
Y_Lasso <- GEZO_Train_Short %>%
  mutate(NonResponse = Total - Response) %>%
  select(NonResponse, Response) %>%
  as.matrix()

#the baseline model
X_Lasso_Base <- as.matrix(GEZO_Train_Short_Dummy[,c(4,5,6)])

Lasso_Base <- Adaptive_Lasso(X_matrix = X_Lasso_Base,
                             Y_matrix = Y_Lasso,
                             seed = 6161138) #this number is Qixiang Fang's student number
plot(Lasso_Base)
coef(Lasso_Base, s = Lasso_Base$lambda.min)

#### 2.3 Full Model ----
Lasso_Full <- Adaptive_Lasso(X_matrix = X_Lasso,
                           Y_matrix = Y_Lasso,
                           seed = 6161138)
plot(Lasso_Full)
coef(Lasso_Full, s = Lasso_Full$lambda.min)


#### 2.4 Interaction Model ----
#prepare input matrices
Formula_Interact <- formula(Total ~ (Days + Reminder1 + Reminder2)*(Tuesday+Wednesday+Thursday+Friday+Saturday+Sunday+Holiday+AveWind+MaxWind+MinWind+AveTemp+MinTemp+MaxTemp+SunDur+SunPer+RainVol+RainDur+MaxRain+AveAir+MaxAir+MinAir+MinVisi+MaxVisi+AveCloud+AveHumid+MaxHumid+MinHumid+DataLeak+Depression+Festival+TrafficJam+Flu+Hacking+HayFever+Influenza+Terrorist+Cold))

FullRankDummy_Lasso <- dummyVars(formula = Formula_Interact, 
                                 GEZO_Train_Short_Dummy, 
                                 fullRank = T,
                                 levelsOnly = TRUE)

GEZO_Train_Short_Dummy_Interact <- predict(FullRankDummy_Lasso, GEZO_Train_Short_Dummy) %>%
  as_tibble()

#the predictor matrix
X_Lasso_Interact <- as.matrix(GEZO_Train_Short_Dummy_Interact)

#run the interaction model
Lasso_Interact <- Adaptive_Lasso(X_matrix = X_Lasso_Interact,
                             Y_matrix = Y_Lasso,
                             seed = 6161138)
plot(Lasso_Interact)
coef(Lasso_Interact, s = Lasso_Interact$lambda.min)


#### 3. Lasso Performance  ----
#### 3.1 Prepare Validation Data Sets ----
#Prepare the validation data sets in the correct format: for the baseline and full models
FullRankDummy_Valid_Long <- dummyVars(formula = Response ~ . -ID -Date -Season -FieldworkPeriod, 
                                      GEZO_Valid_Long, 
                                      fullRank = TRUE,
                                      levelsOnly = TRUE)

GEZO_Valid_Long_Dummy <- predict(FullRankDummy_Valid_Long, GEZO_Valid_Long) %>%
  as_tibble() %>%
  mutate(ID = GEZO_Valid_Long$ID) %>%
  select(ID, everything())

FullRankDummy_Valid_Short <- dummyVars(formula = Response ~ . -Date -FieldworkPeriod -Season, 
                                       GEZO_Valid_Short, 
                                       fullRank = TRUE,
                                       levelsOnly = TRUE)



GEZO_Valid_Short_Dummy <- predict(FullRankDummy_Valid_Short, GEZO_Valid_Short) %>%
  as_tibble()

#validation set in short data format
GEZO_Valid_Short_Lasso_Matrix <- GEZO_Valid_Short_Dummy %>%
  select(-Total) %>%
  model.matrix(formula(Hazard ~.-1), data = .)

#validation set in long data format
GEZO_Valid_Long_Lasso_Matrix <- GEZO_Valid_Long_Dummy %>%
  model.matrix(formula(ID ~.-1), data = .)


#Prepare the validation data sets in the correct format: for the interaction model
#validation set in long data format
FullRankDummy_Valid_Long_Interact <- dummyVars(formula = ID ~ (Days + Reminder1 + Reminder2) * (Tuesday + Wednesday + 
                                                                                                     Thursday + Friday + Saturday + Sunday + Holiday + AveWind + 
                                                                                                     MaxWind + MinWind + AveTemp + MinTemp + MaxTemp + SunDur + 
                                                                                                     SunPer + RainVol + RainDur + MaxRain + AveAir + MaxAir + 
                                                                                                     MinAir + MinVisi + MaxVisi + AveCloud + AveHumid + MaxHumid + 
                                                                                                     MinHumid + DataLeak + Depression + Festival + TrafficJam + 
                                                                                                     Flu + Hacking + HayFever + Influenza + Terrorist + Cold), 
                                      GEZO_Valid_Long_Dummy, 
                                      fullRank = TRUE,
                                      levelsOnly = TRUE)


GEZO_Valid_Long_Dummy_Interact <- predict(FullRankDummy_Valid_Long_Interact, GEZO_Valid_Long_Dummy) %>%
  as_tibble()

GEZO_Valid_Long_Lasso_Matrix_Interact <- as.matrix(GEZO_Valid_Long_Dummy_Interact)


#validation set in short data format
FullRankDummy_Valid_Short_Interact <- dummyVars(formula = Formula_Interact, 
                                       GEZO_Valid_Short_Dummy, 
                                       fullRank = TRUE,
                                       levelsOnly = TRUE)


GEZO_Valid_Short_Dummy_Interact <- predict(FullRankDummy_Valid_Short_Interact, GEZO_Valid_Short_Dummy) %>%
  as_tibble()

GEZO_Valid_Short_Lasso_Matrix_Interact <- as.matrix(GEZO_Valid_Short_Dummy_Interact)


#### 3.2 RMSE ----
#function to compute RMSE
ComputeRMSE <- function(model, newdata) {
  
  obs_hazard <- pull(GEZO_Valid_Short, Hazard)
  
  if (class(model)[1] == "cv.glmnet") {
    newdata <- as.matrix(newdata)
    pred_hazard <- predict(model, newdata, type = "response", s = model$lambda.min)
  } else {
    pred_hazard <- predict(model, newdata, type = "response")
  }
  
  error <- tibble(obs = obs_hazard, pred = pred_hazard) %>%
    mutate(sqrerror = (obs - pred)^2) %>%
    summarise(RMSR = sqrt(mean(sqrerror))) %>%
    pull(RMSR)
  
  return(error)
}

ComputeRMSE(Lasso_Full, as_tibble(GEZO_Valid_Short_Lasso_Matrix)) 
#0.00527493 for the full lasso model

ComputeRMSE(Lasso_Base, as_tibble(GEZO_Valid_Short_Lasso_Matrix[,c(1,2,3)])) 
#0.005528246 for the baseline lasso model

ComputeRMSE(Lasso_Interact, as_tibble(GEZO_Valid_Short_Lasso_Matrix_Interact))
#0.00473863 for the interaction lasso model


#### 4. Variable Importance ----
#extract names of variables with non-zero coefficients in the full lasso model
VarImp_VarList_Lasso <- coef(Lasso_Full, s = Lasso_Full$lambda.min) %>%
  as.matrix() %>%
  as_tibble(rownames = "Variable") %>%
  filter(`1` != 0) %>%
  pull(Variable) %>%
  setdiff(., "(Intercept)")

#function to compute variable importance
PermuteError_Lasso <- function(model, newdata, variable, n, seed, measure = "RMSE"){
  if (measure == "RMSE") {
    rmse_full <- ComputeRMSE(model, newdata)
    
    set.seed(seed)
    perm_data <- permute(as_tibble(newdata), n, variable)
    perm_list <- map(perm_data$perm, ~as_tibble(.))
    perm_RMSE <- lapply(perm_list, ComputeRMSE, model = model)
    
    results_RMSE <- list(origError = rmse_full,
                         permError = perm_RMSE,
                         variable = variable)
    return(results_RMSE)
  } else {
    PD_full <- ComputePD(model, newdata)
    
    set.seed(seed)
    perm_data <- permute(as_tibble(newdata), n, variable)
    perm_list <- map(perm_data$perm, ~as_tibble(.))
    perm_PD <- lapply(perm_list, ComputePD, model = model)
    
    results_PD <- list(origError = PD_full,
                       permError = perm_PD,
                       variable = variable)
    return(results_PD)
  }
}

#compute variance importance
PermuteResult_Lasso <- foreach(Var = VarImp_VarList_Lasso) %do%
  PermuteError_Lasso(model = Lasso_Full, 
               newdata = GEZO_Valid_Short_Lasso_Matrix, 
               variable = Var,
               n = 20,
               seed = 6161138, 
               measure = "RMSE")

Permute_Tibble_Lasso <- tibble(Variable = map(PermuteResult_Lasso, "variable"),
                         PermError = map(PermuteResult_Lasso, "permError"),
                         OrigError = map(PermuteResult_Lasso, "origError")) %>%
  unnest(Variable, OrigError) %>%
  unnest(PermError) %>%
  unnest(PermError)

#### Plot: Variable Importance Full Model ####
#assign full names of the non-zero variables to a vector
NonZero_Var_VarImp_Label <- c("Days",
                              "Day of a Week: Saturday",
                              "Day of a Week: Sunday",
                              "Survey Phase: Reminder 2",
                              "Day of a Week: Wednesday",
                              "Survey Phase: Reminder 1",
                              "Temperature (max.)",
                              "Day of a Week: Friday",
                              "Sunshine Duration",
                              "Cloudiness (avg.)",
                              "Day of a Week: Tuesday",
                              "Precipitation Volume",
                              "Precipitation Volume (max. hr.)",
                              "Holiday",
                              "Air Pressure (avg.)",
                              "Terrorist Attacks",
                              "Maximum Visibility", 
                              "Disease Outbreaks: Cold",
                              "Disease Outbreaks: Depression",
                              "Public Outdoor Engagement: Traffic Jam",
                              "Day of a Week: Thursday")

#plot: variable importance
Permute_Tibble_Lasso %>%
  mutate(VarImp = PermError/OrigError) %>%
  group_by(Variable) %>%
  summarise(VarImp = mean(VarImp)) %>%
  mutate(Variable = fct_reorder(Variable, VarImp)) %>%
  mutate(Contribution = if_else(VarImp < 1, "Negative", "Positive"),
         Contribution = factor(Contribution, levels = c("Positive", "Negative"))) %>%
  ggplot(aes(x = Variable, y = VarImp, fill = Contribution)) +
  geom_col() +
  geom_hline(yintercept = 1,
             lty = 1,
             color = "black",
             size = 0.5) +
  theme_bw(base_size = 14) +
  scale_fill_grey() +
  theme(text=element_text(family="LM Roman 10", size = 12),
        legend.position = "top",
        legend.box = "vertical",
        axis.title.y = element_blank()) +
  scale_x_discrete(labels = rev(NonZero_Var_VarImp_Label)) +
  scale_y_continuous(breaks = seq(0.90, 1.50, 0.1)) +
  coord_flip(ylim = c(0.90, 1.50)) +
  ylab("Variable Importance")

#save the plot
ggsave("VarImp.pdf", width = 8, height = 5.5, dpi = 500)


#### Plot: First Three Data Collection Periods (3 DCPs) ####
GEZO_Train_Short %>%
  mutate(Year = year(Date)) %>%
  group_by(FieldworkPeriod, Year) %>%
  mutate(Total = max(Total),
         Response = cumsum(Response),
         Rate = Response/Total) %>%
  filter(FieldworkPeriod %in% c("January", "February", "March") & Year == 2016) %>%
  ggplot(aes(x = Date, y = Rate)) +
  geom_point(aes(shape = SurveyPhase),size = 2) +
  geom_line(aes(
    group = FieldworkPeriod),
    size = 1) +
  theme(text=element_text(family="LM Roman 10", size = 12),
        legend.position = "top",
        legend.box = "vertical") +
  ylab("Cumulative Response Rate") +
  labs(shape = "Survey Phase") +
  scale_shape_discrete(labels = c("Invitation",
                                  "Reminder 1",
                                  "Reminder 2")) +
  scale_x_date(date_labels="%b %d", date_breaks  = "1 month")

#save plot
ggsave("3DCP.pdf", width = 5, height = 5, dpi = 500)


#### Plot: Coefs of Full Model ####
#sd of the continuous variables
SD_Continuous <- GEZO_Train_Short %>%
  select(Days, AveWind:Cold) %>%
  summarise_all(sd) %>%
  gather(Variable, SD)

#variable list
Var_List_Lasso <- colnames(GEZO_Valid_Short_Lasso_Matrix)

#sd of all the variables
SD_All <- tibble(Variable = Var_List_Lasso) %>%
  left_join(SD_Continuous, by = "Variable") %>%
  mutate(SD = tidyr::replace_na(SD, 1))

#compute standardised exponentiated model estimates
Lasso_Summ <- coef(Lasso_Full, s = Lasso_Full$lambda.min) %>%
  as.matrix() %>%
  as_tibble(rownames = "Variable") %>%
  rename(Est = 2) %>%
  filter(Variable != "(Intercept)") %>%
  left_join(SD_All, by = "Variable") %>%
  mutate(Est = Est * SD) %>%
  mutate_at(vars(Est), exp) %>%
  select(Variable, Est) %>%
  mutate(Model = "Lasso") %>%
  rename(Exp = Est)

#assign variable labels to a vector
VarAll_Label <- c("Days",
                  "Survey Phase: Reminder 1",
                  "Survey Phase: Reminder 2",
                  "Day of a Week: Tuesday",
                  "Day of a Week: Wednesday",
                  "Day of a Week: Thursday",
                  "Day of a Week: Friday",
                  "Day of a Week: Saturday",
                  "Day of a Week: Sunday",
                  "Holiday",
                  "Wind Speed (avg. hr.)",
                  "Wind Speed (max. hr.)",
                  "Wind Speed (min. hr.)",
                  "Temperature (avg.)",
                  "Temperature (max.)",
                  "Temperature (min.)",
                  "Sunshine Duration",
                  "Sunshine Percentage",
                  "Precipitation Volume",
                  "Precipitation Duration",
                  "Precipitation Volume (max. hr.)",
                  "Air Pressure (avg.)",
                  "Air Pressure (max.)",
                  "Air Pressure (min.)",
                  "Visibility (max.)",
                  "Visibility (min.)",
                  "Cloudiness (avg.)",
                  "Humidity (avg.)",
                  "Humidity (max.)",
                  "Humidity (min.)",
                  "Disease Outbreaks: Depression",
                  "Disease Outbreaks: Flu",
                  "Disease Outbreaks: Influenza",
                  "Disease Outbreaks: Cold",
                  "Disease Outbreaks: Hay Fever",
                  "Privacy Concern: Data Leak",
                  "Privacy Concern: Hacking",
                  "Public Outdoor Engagement: Festival",
                  "Public Outdoor Engagement: Traffic Jam",
                  "Terrorist Attacks")

#assign variable order to a vector
Var_List_Lasso_Levels <- c(Var_List_Lasso[1:14],
                           "MaxTemp", "MinTemp", 
                           Var_List_Lasso[17:24],
                           "MaxVisi", "MinVisi",
                           Var_List_Lasso[27:30],
                           "Depression",
                           "Flu",
                           "Influenza",
                           "Cold",
                           "HayFever",
                           "DataLeak",
                           "Hacking",
                           "Festival",
                           "TrafficJam",
                           "Terrorist")

#a vector indicating whether a variable is non-zero or not in the full lasso model
Lasso_NonZero_Var <- Lasso_Summ %>%
  filter(Exp != 1) %>%
  pull(Variable)

#a vector indicating the position of non-zero variables
Lasso_NonZero_Var_Index <- Var_List_Lasso_Levels %in% Lasso_NonZero_Var

#plot
Lasso_Summ %>%
  filter(Exp != 1) %>%
  mutate(Variable = factor(Variable, levels = Var_List_Lasso_Levels[Lasso_NonZero_Var_Index])) %>%
  ggplot(aes(x = Variable, y = Exp)) +
  geom_point(size = 3) +
  geom_hline(yintercept = 1) +
  geom_text(aes(label = round(Exp,3)), hjust = -0.2, vjust = 0.1) +
  theme_bw(base_size = 14) +
  theme(text=element_text(family="LM Roman 10", size = 12),
        legend.position = "top",
        legend.box = "vertical") +
  scale_x_discrete(limits = rev(Var_List_Lasso_Levels[Lasso_NonZero_Var_Index]),
                   labels = rev(VarAll_Label[Lasso_NonZero_Var_Index])) +
  scale_y_continuous(breaks = seq(0.30, 1.30, 0.1)) +
  coord_flip(ylim = c(0.30, 1.30)) +
  ylab("Exponentiated Standardised Coefficient Estimate")

#save plot
ggsave("AllCoef.pdf", width = 8, height = 5.5, dpi = 500)



#### Plot: Predicted Cumulative Response Rates ####
#predicted response hazards based on the full lasso model
Pred_Haz_Lasso_Full_Long <- predict(Lasso_Full, 
                                    GEZO_Valid_Long_Lasso_Matrix, 
                                     s = Lasso_Full$lambda.min,
                                     type = "response")

#predicted response hazards based on the baseline lasso model
Pred_Haz_Lasso_Base_Long <- predict(Lasso_Base, 
                                     GEZO_Valid_Long_Lasso_Matrix[, c(1,2,3)], 
                                     s = Lasso_Base$lambda.min,
                                     type = "response")

#predicted response hazards based on the interaction lasso model
Pred_Haz_Lasso_Interact_Long <- predict(Lasso_Interact, 
                                         GEZO_Valid_Long_Lasso_Matrix_Interact, 
                                         s = Lasso_Interact$lambda.min,
                                         type = "response")



#cumulative by phase and days
#calculate predicted cumulative response rates by phase and days
Cum_Hazard_TB_Ave <- GEZO_Valid_Long %>%
  select(Response, Days, SurveyPhase) %>%
  mutate(SurveyPhase = recode_factor(SurveyPhase, 
                                     "Invitation" = "Invitation",
                                     "Reminder1" = "Reminder 1",
                                     "Reminder2" = "Reminder 2")) %>%
  mutate(Baseline = as.vector(Pred_Haz_Lasso_Base_Long),
         Full = as.vector(Pred_Haz_Lasso_Full_Long),
         Interaction = as.vector(Pred_Haz_Lasso_Interact_Long),
         Response = if_else(Response == "Yes", 1, 0)) %>%
  rename(Observed = Response) %>%
  gather(Model, Hazard, -Days, -SurveyPhase) %>%
  group_by(Days, SurveyPhase, Model) %>%
  summarise(Hazard = mean(Hazard)) %>%
  group_by(SurveyPhase, Model) %>%
  mutate(Surv = cumprod(1-Hazard),
         Rate = 1-Surv) %>%
  ungroup()

#plot
Cum_Hazard_TB_Ave %>%
  mutate(Model = factor(Model, 
                        levels = c("Baseline", "Full", "Interaction", "Observed"))) %>%
  ggplot(aes(x = Days, y = Rate)) +
  geom_point(aes(shape = Model,
                 color = Model,
                 size = Model)) +
  geom_line(aes(color = Model),
            data = filter(Cum_Hazard_TB_Ave, Model != "Observed")) +
  scale_color_grey() +
  scale_fill_grey() +
  theme_bw() +
  theme(text=element_text(family="LM Roman 10", size = 12),
        panel.grid = element_blank(),
        legend.position = "top") +
  scale_shape_manual(values=c(19, 17, 15, 8))+
  scale_size_manual(values = c(2,2,2,2,3)) +
  facet_grid(. ~ SurveyPhase, scale = "free") +
  ylab("Cumulative Response Rate")

#save plot
ggsave("CumByPhase.pdf", width = 9, height = 5, dpi = 500)
